library(cleaningtools)
library(dplyr)
my_raw_dataset <- cleaningtools::cleaningtools_raw_data
my_kobo_survey <- cleaningtools::cleaningtools_survey
my_kobo_choice <- cleaningtools::cleaningtools_choices02 - R framework with IMPACT - session 2
Recap
more_logs <- my_raw_dataset %>%
check_duplicate(uuid_column = "X_uuid") %>%
check_soft_duplicates(uuid_column = "X_uuid", kobo_survey = my_kobo_survey, sm_separator = ".") %>%
check_outliers(uuid_column = "X_uuid") %>%
check_value(uuid_column = "X_uuid")
more_logs$checked_dataset <- more_logs$checked_dataset %>%
add_duration(uuid_column = "X_uuid", start_column = "X.U.FEFF.start", end_column = "end")
more_logs <- more_logs %>%
check_duration(column_to_check = "duration", uuid_column = "X_uuid")
other_columns_to_check <- my_kobo_survey %>%
filter(type == "text") %>%
filter(name %in% names(my_raw_dataset)) %>%
pull(name)
more_logs <- more_logs %>%
check_others(uuid_column = "X_uuid", columns_to_check = other_columns_to_check)
logical_check_list <- readxl::read_excel("inputs/01 - example - check_list.xlsx")
more_logs <- more_logs %>%
check_logical_with_list(uuid_column = "X_uuid",
list_of_check = logical_check_list,
check_id_column = "check_id",
check_to_perform_column = "check_to_perform",
columns_to_clean_column = "columns_to_clean",
description_column = "description")Creating a cleaning log
create_combined_log
create_combined_log will combined all the logs from a list into one. It will also add 2 columns that will be used for the cleaning.
names(more_logs)[1] "checked_dataset" "duplicate_log" "soft_duplicate_log"
[4] "potential_outliers" "flaged_value" "duration_log"
[7] "other_log" "logical_all"
my_combined_log <- create_combined_log(more_logs)List of element to combine- checked_dataset, duplicate_log, soft_duplicate_log, potential_outliers, flaged_value, duration_log, other_log, logical_all
typeof(my_combined_log)[1] "list"
names(my_combined_log)[1] "checked_dataset" "cleaning_log"
my_combined_log$cleaning_log %>%
head()| uuid | old_value | question | issue | check_id | check_binding | change_type | new_value |
|---|---|---|---|---|---|---|---|
| b5b1d37a-e27a-4c35-a0f5-2cde9d6dfd06 | 86 | age_respondent_r | outlier (normal distribution) | NA | age_respondent_r / b5b1d37a-e27a-4c35-a0f5-2cde9d6dfd06 | NA | NA |
| 956b5ed0-5a62-41b7-aec3-af93fbc5b494 | 84 | age_respondent_r | outlier (normal distribution) | NA | age_respondent_r / 956b5ed0-5a62-41b7-aec3-af93fbc5b494 | NA | NA |
| 97ad6294-30c6-454e-a0b3-42126415b767 | 18 | age_respondent_r | outlier (log distribution) | NA | age_respondent_r / 97ad6294-30c6-454e-a0b3-42126415b767 | NA | NA |
| e005e719-57c4-44a3-ac2f-5d6d1ff68831 | 18 | age_respondent_r | outlier (log distribution) | NA | age_respondent_r / e005e719-57c4-44a3-ac2f-5d6d1ff68831 | NA | NA |
| c9aaa542-118f-4e42-93de-fb0916572541 | 19 | num_hh_member | outlier (normal distribution) | NA | num_hh_member / c9aaa542-118f-4e42-93de-fb0916572541 | NA | NA |
| 48e8896b-d1be-4600-8839-2d8b994ebcfb | 19 | num_hh_member | outlier (normal distribution) | NA | num_hh_member / 48e8896b-d1be-4600-8839-2d8b994ebcfb | NA | NA |
The cleaning log contains all the columns from all the logs from more_logs with in addition:
- check_binding is filled for all rows.
- change_type (empty)
- new_value (empty)
add_info_to_cleaning_log
If more information from the dataset should be added, the function add_info_to_cleaning_log can help.
my_combined_log <- my_combined_log %>%
add_info_to_cleaning_log(dataset_uuid_column = "X_uuid",
information_to_add = "enumerator_num")
my_combined_log$cleaning_log %>%
head()| uuid | old_value | question | issue | check_id | check_binding | change_type | new_value | enumerator_num |
|---|---|---|---|---|---|---|---|---|
| 019bc718-c06a-46b8-bba8-c84f6c6efbd5 | لا اعلم | water_supply_other_neighbourhoods_why | recode other | NA | water_supply_other_neighbourhoods_why / 019bc718-c06a-46b8-bba8-c84f6c6efbd5 | NA | NA | 12 |
| 019bc718-c06a-46b8-bba8-c84f6c6efbd5 | لا ارى جدوى من ذلك | prefer_not_engage_other | recode other | NA | prefer_not_engage_other / 019bc718-c06a-46b8-bba8-c84f6c6efbd5 | NA | NA | 12 |
| 019bc718-c06a-46b8-bba8-c84f6c6efbd5 | السلطات ليست مهتمة بالخدمات | trust_water_office_why_not | recode other | NA | trust_water_office_why_not / 019bc718-c06a-46b8-bba8-c84f6c6efbd5 | NA | NA | 12 |
| 019bc718-c06a-46b8-bba8-c84f6c6efbd5 | 247.20 | duration | Duration is lower or higher than the thresholds | NA | duration / 019bc718-c06a-46b8-bba8-c84f6c6efbd5 | NA | NA | 12 |
| 03183d24-0275-43fe-8976-d076f29de590 | عدم توفير خدمه المياه في المنطقه وضعف في تزويد خدمه وعدد ساعات اقل تجهيز للبيوت | water_supply_other_neighbourhoods_why | recode other | NA | water_supply_other_neighbourhoods_why / 03183d24-0275-43fe-8976-d076f29de590 | NA | NA | 2 |
| 03183d24-0275-43fe-8976-d076f29de590 | 10 | pay_water_charges_amount | outlier (log distribution) | NA | pay_water_charges_amount / 03183d24-0275-43fe-8976-d076f29de590 | NA | NA | 2 |
create_xlsx_cleaning_log
create_xlsx_cleaning_log(my_combined_log,
sm_dropdown_type = "logical",
output_path = "outputs/01 - example - cleaning-log-no-kobo.xlsx")create_xlsx_cleaning_log will write an excel file with:
- checked_dataset tab: the checked dataset, with additional columns if any.
- cleaning_log tab: the combined log with the change_type column with a data validation rules.
- readme tab: change_type values definition.
There are 4 actions possible:
- change_response: Change the response to new.value
- blank_response: Remove and NA the response
- remove_survey: Delete the survey
- no_action: No action to take.
This log will have to be filled in with actions to take and new value if needed.
create_xlsx_cleaning_log(my_combined_log,
kobo_survey = my_kobo_survey,
kobo_choices = my_kobo_choice,
use_dropdown = T,
sm_dropdown_type = "logical",
output_path = "outputs/02 - example - cleaning-log-with-kobo.xlsx")If the KOBO information are provided and the use_dropdown argument is set to TRUE, new_value will have a data validation rule based on the KOBO options.
Select multiple dummy columns (TRUE/FALSE or 1/0) are flagged and used later for the cleaning, not the parent column.
Creating a clean dataset
The cleaning has to be filled before moving forward. The above steps are to create the cleaning log, not to fill it or clean the dataset. Filling the cleaning log is not an automatic step for this process. The decision to change a value has to be recorded.
review_cleaning_log
review_cleaning_log will review the filled cleaning log.
my_filled_log <- readxl::read_excel("inputs/02 - example - cleaning-log-with-kobo - filled.xlsx", sheet = 2)
check_log_results <- review_cleaning_log(raw_dataset = my_raw_dataset,
raw_data_uuid_column = "X_uuid",
cleaning_log = my_filled_log,
cleaning_log_uuid_column = "uuid",
cleaning_log_question_column = "question",
cleaning_log_new_value_column = "new_value",
cleaning_log_change_type_column = "change_type",
change_response_value = "change_response")
check_log_results[1] "no issues in cleaning log found"
create_clean_data
To create the clean dataset, create_clean_data will use the raw dataset and the filled cleaning log.
my_clean_data <- create_clean_data(raw_dataset = my_raw_dataset,
raw_data_uuid_column = "X_uuid",
cleaning_log = my_filled_log,
cleaning_log_uuid_column = "uuid",
cleaning_log_question_column = "question",
cleaning_log_new_value_column = "new_value",
cleaning_log_change_type_column = "change_type")[1] "water_supply_other_neighbourhoods_why"
[1] "trust_water_office_why_not"
[1] "pay_water_charges_amount"
[1] "connection_fees_amount"
[1] "connection_fees_amount"
[1] "primary_livelihood.employment"
[1] "primary_livelihood.employment"
[1] "primary_livelihood.employment"
[1] "tank_emptied"
[1] "access_water_enough"
recreate_parent_column
In the cleaning log, some select multiple are changed, but only the dummy. The parent column should be recoded also.
my_filled_log %>%
filter(question == "primary_livelihood.employment",
change_type == "change_response") %>%
select(uuid, question, old_value, new_value)| uuid | question | old_value | new_value |
|---|---|---|---|
| eb3b9935-fa0b-4d54-8058-3b629f1421ad | primary_livelihood.employment | TRUE | FALSE |
| eec1f630-15d5-475e-a344-32bba74b32ea | primary_livelihood.employment | TRUE | FALSE |
| f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb | primary_livelihood.employment | TRUE | FALSE |
my_clean_data %>%
filter(X_uuid %in% c("eb3b9935-fa0b-4d54-8058-3b629f1421ad", "eec1f630-15d5-475e-a344-32bba74b32ea", "f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb")) %>%
select(X_uuid,primary_livelihood, primary_livelihood.employment)| X_uuid | primary_livelihood | primary_livelihood.employment |
|---|---|---|
| f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb | employment | FALSE |
| eb3b9935-fa0b-4d54-8058-3b629f1421ad | employment ngo | FALSE |
| eec1f630-15d5-475e-a344-32bba74b32ea | employment | FALSE |
my_clean_data2 <- recreate_parent_column(dataset = my_clean_data,
uuid_column = "X_uuid",
kobo_survey = my_kobo_survey,
kobo_choices = my_kobo_choice,
sm_separator = ".",
cleaning_log_to_append = my_filled_log)my_clean_data2$data_with_fix_concat %>%
filter(X_uuid %in% c("eb3b9935-fa0b-4d54-8058-3b629f1421ad", "eec1f630-15d5-475e-a344-32bba74b32ea", "f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb")) %>%
select(X_uuid,primary_livelihood, primary_livelihood.employment)| X_uuid | primary_livelihood | primary_livelihood.employment |
|---|---|---|
| f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb | NA | FALSE |
| eb3b9935-fa0b-4d54-8058-3b629f1421ad | ngo | FALSE |
| eec1f630-15d5-475e-a344-32bba74b32ea | NA | FALSE |
my_clean_data2$cleaning_log %>%
filter(question == "primary_livelihood",
uuid %in% c("eb3b9935-fa0b-4d54-8058-3b629f1421ad", "eec1f630-15d5-475e-a344-32bba74b32ea", "f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb")) %>%
select(uuid, question, old_value, new_value)| uuid | question | old_value | new_value |
|---|---|---|---|
| eb3b9935-fa0b-4d54-8058-3b629f1421ad | primary_livelihood | employment ngo | ngo |
| eec1f630-15d5-475e-a344-32bba74b32ea | primary_livelihood | employment | NA |
| f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb | primary_livelihood | employment | NA |
review_others
In the cleaning log, some opentext values are changed to blank. Some open text questions are linked some skip logic, i.e. what is X? Other, please specify. In some cases, values some values should be changed.
In the example below, the value for water_supply_other_neighbourhoods_why for the uuid 019bc718-c06a-46b8-bba8-c84f6c6efbd5 was changed to NA.
my_filled_log %>%
filter(question == "water_supply_other_neighbourhoods_why",
change_type == "blank_response")| uuid | old_value | question | issue | check_id | check_binding | change_type | new_value | enumerator_num |
|---|---|---|---|---|---|---|---|---|
| 019bc718-c06a-46b8-bba8-c84f6c6efbd5 | لا اعلم | water_supply_other_neighbourhoods_why | recode other | NA | water_supply_other_neighbourhoods_why / 019bc718-c06a-46b8-bba8-c84f6c6efbd5 | blank_response | NA | 12 |
The kobo show a skip logic based on water_supply_other_neighbourhoods.
my_kobo_survey %>%
filter(name == "water_supply_other_neighbourhoods_why") %>%
select(type, name, relevant)| type | name | relevant |
|---|---|---|
| text | water_supply_other_neighbourhoods_why | selected(\({water_supply_other_neighbourhoods},'somewhat_worse') or selected(\){water_supply_other_neighbourhoods},‘much_worse’) |
my_clean_data %>%
filter(X_uuid == "019bc718-c06a-46b8-bba8-c84f6c6efbd5") %>%
select(water_supply_other_neighbourhoods, water_supply_other_neighbourhoods_why )| water_supply_other_neighbourhoods | water_supply_other_neighbourhoods_why |
|---|---|
| somewhat_worse | NA |
Should the value of water_supply_other_neighbourhoods be changed? It depends on the question and skip logic but it important to flag those so a decision can be taken.
review_other_log <- review_others(dataset = my_clean_data2$data_with_fix_concat,
uuid_column = "X_uuid",
kobo_survey = my_kobo_survey,
columns_not_to_check = "consent_telephone_number")Warning in create_logic_for_other(kobo_survey = kobo_survey,
compare_with_dataset = TRUE, : The following parent names: well_quality,
spring_quality, rainwater_quality, surface_quality, why_not_connected were not
found in the dataset. The function is ignoring them.
review_cleaning
my_deletion_log <- my_clean_data2$cleaning_log %>%
filter(change_type == "remove_survey")
my_filled_log_no_deletion <- my_clean_data2$cleaning_log %>%
filter(change_type != "remove_survey") %>%
filter(!uuid %in% my_deletion_log$uuid)
review_of_cleaning <- review_cleaning(raw_dataset = my_raw_dataset,
raw_dataset_uuid_column = "X_uuid",
clean_dataset = my_clean_data2$data_with_fix_concat,
clean_dataset_uuid_column = "X_uuid",
cleaning_log = my_filled_log_no_deletion,
cleaning_log_uuid_column = "uuid",
cleaning_log_question_column = "question",
cleaning_log_new_value_column = "new_value",
cleaning_log_change_type_column = "change_type",
cleaning_log_old_value_column = "old_value",
deletion_log = my_deletion_log,
deletion_log_uuid_column = "uuid"
)Exercises
Exercise 1
- Export the cleaning log you have created previously.
previous_exercise_log <- readRDS("inputs/03 - exercise - previous_log.RDS")
previous_exercise_log %>% names()[1] "checked_dataset" "percentage_missing_log" "potential_PII"
[4] "logical_all"
Exercise 2
- Create the clean data from the raw dataset and the filled cleaning.
exercise_filled_log <- readxl::read_excel("inputs/04 - exercise - cleaning_log - filled.xlsx", sheet = "cleaning_log")Exercise 3
- Review the cleaning below, if there is someone else doing the exercise, you can try to review someone’s cleaning.
exercise3_clean_dataset <- readxl::read_excel("inputs/05 - exercise - clean dataset for review.xlsx")
exercise3_cleaning_log <- readxl::read_excel("inputs/05 - exercise - clean dataset for review.xlsx", sheet = 2)